{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pathlib\n",
    "import pickle\n",
    "import jieba\n",
    "import pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# feature_extraction.text 中的常用组件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "raw = ['我们 的 祖国 是 花园 ， 花园 里 花朵 真 鲜艳 。','我 是 共产主义 接班人']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vocb = list(set(raw[0].split(' ') + raw[1].split(' ')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CountVectorizer 将篇章转化为词频的矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cv = CountVectorizer(vocabulary=vocb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0],\n",
       "        [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.fit_transform(raw).todense()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TfidfVectorizer 将篇章转化为tfidf矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tf = TfidfVectorizer(vocabulary=vocb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[ 0.35355339,  0.        ,  0.35355339,  0.70710678,  0.35355339,\n",
       "          0.        ,  0.        ,  0.        ,  0.        ,  0.35355339,\n",
       "          0.        ,  0.        ,  0.        ,  0.        ],\n",
       "        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "          0.        ,  0.        ,  0.        ,  0.70710678,  0.        ,\n",
       "          0.70710678,  0.        ,  0.        ,  0.        ]])"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tf.fit_transform(raw).todense()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TifdfTransformer 将词频矩阵转化为tfidf矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tft = TfidfTransformer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "matrix([[ 0.35355339,  0.        ,  0.35355339,  0.70710678,  0.35355339,\n",
       "          0.        ,  0.        ,  0.        ,  0.        ,  0.35355339,\n",
       "          0.        ,  0.        ,  0.        ,  0.        ],\n",
       "        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
       "          0.        ,  0.        ,  0.        ,  0.70710678,  0.        ,\n",
       "          0.70710678,  0.        ,  0.        ,  0.        ]])"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tft.fit_transform(cv.fit_transform(raw)).todense()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
